*** How worried should we be? The implications of fabricated survey data for political science
*** Figure A1. Differences in Analyses of Change over Time
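*** Must have coefplot, grc1leg2 packages installed; the script also calls the user-written commands lp_resc and labmask and the plotplain graph scheme, which must be installed as well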

* Session setup: switch off output paging.
set more off

* Edit the path on the next line so it points at the folder holding the datasets.
cd "C:\~\Downloads\"


** Stack the 2014 Venezuela AmericasBarometer file with the second data file
use "Venezuela_AmericasBarometer_2014.dta", clear
* every row of the 2014 file gets weight 1 and is tagged with its wave
generate weight1500 = 1
generate wave = 2014

* rows that still have no wave after the append are tagged as 2016
append using "VEN_fraud_data.dta", force
replace wave = 2016 if wave == .
replace wt = 1 if wt == .
* 2016 rows: set weight1500 to 1500 / 1489 and take upm from upm1
replace weight1500 = 1500 / 1489 if wave == 2016
replace upm = upm1 if wave == 2016

* rows from before 2016 are set to 0 on both flags
foreach flag in likelyfraud cem_matched {
	replace `flag' = 0 if wave < 2016
}

** Assign every interview to a comparison group for the full-sample comparisons
* Remove rows with clean_data == 0 unless they are matched (cem_matched == 1).
drop if clean_data == 0 & cem_matched != 1
* Group codes: 1 = fake, 2 = clean matched, 3 = rest of data.
* Rows that fit none of the three patterns are left missing.
generate comparison_groups = cond(likelyfraud == 1 & cem_matched == 1, 1, ///
	cond(likelyfraud == 0 & cem_matched == 1, 2, ///
	cond(likelyfraud == 0 & cem_matched == 0, 3, .)))

* Double the "rest of the dataset" (non-matched clean interviews, group 3).
* exp holds how many copies of each row should exist after expand (1 or 2);
* expand's gen(copy) marks original rows with copy == 0 and added duplicates
* with copy == 1.
* The group variable is written out in full (comparison_groups) so this line
* does not depend on variable-name abbreviation being enabled.
gen exp=1
replace exp=2 if comparison_groups == 3
expand exp, gen(copy)

* compromised versus clean indicator (1 = clean, 2 = compromised)
* The order of the two replace lines matters: the first marks group 2 and every
* original row (copy == 0) as clean, the second then overrides group 1 and every
* duplicated row (copy == 1) to compromised. Net result: group 2 and the
* original group-3 rows are clean; group 1 and the duplicated group-3 rows are
* compromised.
* comparison_groups is written out in full so these lines do not depend on
* variable-name abbreviation being enabled.
gen clean_or_compr = .
replace clean_or_compr = 1 if comparison_groups == 2 | copy == 0
replace clean_or_compr = 2 if comparison_groups == 1 | copy == 1

* 0/1 version of the same indicator, with value labels for output
gen clean = 1 if clean_or_compr == 1
replace clean = 0 if clean_or_compr == 2
lab define clean_comp_lab 0 "Compromised" 1 "Clean"
lab values clean clean_comp_lab


** trend analyses

* Pass 1 over all questions: recode the listed non-response codes to missing
* (in place, so later steps see the recoded values) and record the largest
* remaining value of each question as its scale.
* Output: questions_sc.dta with one row per question (question, scale).

set more off
* Close a qdata handle left open by an earlier run that stopped inside the
* loop; otherwise postfile would fail because the handle is already in use.
* capture makes this a no-op when no such handle exists.
capture postclose qdata
postfile qdata str32 question scale using "questions_sc.dta", replace
foreach x of varlist l1 b1 b2 b3 b4 b6 b12 b13 b18 b21 b21a b32 b47a venb11 polz1 ///
ros1 ros4 ing4 eff1 eff2 e5 e15 e3 e16 d1 d2 d3 d4 d5 d6 ///
ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 pn4 ///
pol1 venesc3 sd2new2 sd3new2 sd6new2 {
	* echo the question name as a progress marker
	di "`x'"
	qui recode `x' (888888 988888 999999 99 = .)
	* r(max) from summarize is the largest non-missing value after the recode
	qui summ `x'
	post qdata ("`x'") (r(max))
}
*
postclose qdata

* Pass 2 over all questions: for each question, regress the rescaled item on
* wave separately in the clean (clean == 1) and compromised (clean == 0)
* samples and store the wave contrast with its confidence limits and p-value.
* Output: questions_trends_differences.dta with one row per question.

* Close a qdata handle left open by an earlier run that stopped inside the
* loop; otherwise postfile would fail because the handle is already in use.
* capture makes this a no-op when no such handle exists.
capture postclose qdata
postfile qdata str32 question ///
	clean_contrast clean_contrast_ll clean_contrast_ul clean_contrast_p ///
	dirty_contrast dirty_contrast_ll dirty_contrast_ul dirty_contrast_p ///
	using "questions_trends_differences.dta", replace
foreach x of varlist l1 b1 b2 b3 b4 b6 b12 b13 b18 b21 b21a b32 b47a venb11 polz1 ///
ros1 ros4 ing4 eff1 eff2 e5 e15 e3 e16 d1 d2 d3 d4 d5 d6 ///
ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 pn4 ///
pol1 venesc3 sd2new2 sd3new2 sd6new2 {
	* echo the question name as a progress marker
	di "`x'"
	* recode left disabled here; the non-response codes were already recoded
	* in place by the earlier loop over the same questions
	*recode `x' (888888 988888 999999 99 = .)
	* user-written LAPOP command; presumably rescales `x' to 0-100 and stores
	* the result in `x'_rs (the variable used below) -- confirm against its help
	lp_resc `x', min(0) max(100) sufv(_rs)

	* clean sample: column 2 of r(table) is the non-base wave term
	svy: reg `x'_rs i.wave if clean == 1 & wave >= 2014
	matrix a = r(table)
	* contrast
	di a[1,2]
	* ci lower
	di a[5,2]
	* ci upper 
	di a[6,2]
	* p-value
	di a[4,2]

	* compromised sample: same model, same cells
	svy: reg `x'_rs i.wave if clean == 0 & wave >= 2014
	matrix b = r(table)
	* contrast
	di b[1,2]
	* ci lower
	di b[5,2]
	* ci upper 
	di b[6,2]
	* p-value
	di b[4,2]

	* write data: estimate, lower limit, upper limit, p-value for each sample
	post qdata ("`x'") (a[1,2]) (a[5,2]) (a[6,2]) (a[4,2]) (b[1,2]) (b[5,2]) (b[6,2]) (b[4,2])
}
*
postclose qdata

* Bring the per-question trend estimates together with the per-question scales.
use "questions_trends_differences.dta", clear
merge 1:1 question using questions_sc.dta

* 0/1 significance flags for the clean and compromised (dirty) contrasts.
* A missing p-value gives 0, because the comparison p < threshold is false.
* First at the .05 threshold ...
foreach grp in clean dirty {
	gen `grp'_diff_sig = (`grp'_contrast_p < .05)
}
* ... then at the .1 threshold.
foreach grp in clean dirty {
	gen `grp'_diff_sig90 = (`grp'_contrast_p < .1)
}

* Order questions by scale (ascending), then by name (descending), and build
* an anonymised axis label "<scale>pt_v<row number>" for each one.
gsort scale -question
* One rule for every scale: previously only 3/4/5/7/10-point scales received a
* label at all, and the "*" marker was skipped for 3- and 5-point scales.
gen question_label = string(scale) + "pt_v" + string(_n) if !missing(scale)
* "*" marks questions whose .05-level significance differs between the clean
* and the compromised estimates
replace question_label = question_label + "*" if !missing(scale) & clean_diff_sig != dirty_diff_sig

* cross-tabulate significance in the clean versus the compromised estimates
tab clean_diff_sig dirty_diff_sig
*tab clean_diff_sig90 dirty_diff_sig90

* keep only questions present in both merged files
keep if _merge == 3
drop _merge

* q_id is the plotting position (current row order), labelled with
* question_label; the compromised series is drawn at q_id + .3 so its marker
* and interval do not sit on top of the clean ones
gen q_id = _n
labmask q_id, values(question_label)
gen q_id_dirty = q_id + .3

* Figure A1
* Two panels with identical styling: panel 1 shows q_id below 25, panel 2 shows
* q_id from 25 up to (not including) 50. Each panel plots the clean contrast
* (solid black circle, solid interval) and the compromised contrast (hollow red
* circle, dashed interval) and is saved as trend1.gph / trend2.gph; the two
* saved panels are then combined under one shared legend.
local sel1   "q_id < 25"
local sel2   "q_id >= 25 & q_id < 50"
local ticks1 "1(1)24"
local ticks2 "25(1)49"
forvalues p = 1/2 {
	* row filter and y-axis tick range for this panel
	local sel   "`sel`p''"
	local ticks "`ticks`p''"
	twoway ///
		(scatter q_id clean_contrast if `sel', msymbol(circle) mcolor(black)) ///
		(pcbarrow q_id clean_contrast_ll q_id clean_contrast_ul if `sel', mcolor(none) lcolor(black) lwidth(medthick)) ///
		(scatter q_id_dirty dirty_contrast if `sel', msymbol(circle_hollow) mcolor(red)) ///
		(pcbarrow q_id_dirty dirty_contrast_ll q_id_dirty dirty_contrast_ul if `sel', mcolor(none) lcolor(red) lpattern(dash) lwidth(medthick)), ///
		scheme(plotplain) ylabel(`ticks', valuelabel angle(horizontal) labsize(small)) xline(0) ytitle("Question") ///
		legend(label(1 "Clean") label(3 "Compromised") order(1 3) row(1)) saving(trend`p'.gph, replace)
}
grc1leg2 trend1.gph trend2.gph, row(1) scheme(plotplain)

